from transformers import TrainingArguments, Trainer  # import Trainer-related classes
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
import os
# Path to the 10k-example SimCLUE sentence-pair training file (JSON lines).
dataset_dir = "/data/datasets/SimCLUE/datasets/train_pair_1w.json"

# Load everything as a single split, then hold out 20% for evaluation.
# NOTE(review): the split is not seeded, so train/test membership changes
# between runs — confirm whether a fixed seed is wanted here.
dataset = load_dataset("json", data_files=dataset_dir, split="train")
datasets = dataset.train_test_split(test_size=0.2)

# Tokenizer from the local chinese-macbert-base checkpoint.
model_dir = "/data/models/huggingface/chinese-macbert-base"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
def process_function(examples):
    """Tokenize sentence pairs and attach float labels.

    Args:
        examples: batched dict with 'sentence1', 'sentence2' and 'label'
            columns (as provided by ``datasets.map(..., batched=True)``).

    Returns:
        The tokenizer encoding (input_ids, attention_mask, ...) with an
        added 'labels' list of floats.
    """
    encoded = tokenizer(
        examples["sentence1"],
        examples["sentence2"],
        truncation=True,
        max_length=128,
    )
    # Cast labels to float — presumably for a regression-style head
    # (num_labels=1); verify against the model configuration.
    encoded["labels"] = list(map(float, examples["label"]))
    return encoded

# Tokenize both splits in batches, dropping the raw text/label columns so
# only model-ready fields (input_ids, attention_mask, labels, ...) remain.
raw_columns = datasets["train"].column_names
tokenizer_datasets = datasets.map(
    process_function,
    batched=True,
    remove_columns=raw_columns,
)
# Quick sanity check: show the first two tokenized training examples.
print(tokenizer_datasets["train"][:2])